Assignment 2¶

Collaborators: Aldrin Ilagan, Lindsey Rappaport, Yvanna Cardenas¶

In [5]:
# Import packages (same from Workbook 3)
import glob
import random
from typing import List
from collections import defaultdict

import numpy as np
from numpy.random import choice

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from symusic import Score
from miditok import REMI, TokenizerConfig
/Users/ycardenas/Library/Python/3.9/lib/python/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020
  warnings.warn(
/Users/ycardenas/Library/Python/3.9/lib/python/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
In [ ]:
random.seed(42)

Symbolic, unconditioned generation¶

In [ ]:
# Parse data
file_path = './maestro-v3.0.0/'
test_files = []
# Set train files to years 2004-2011
train_files = glob.glob(file_path + '2004/*.midi')
train_files += glob.glob(file_path + '2006/*.midi')
train_files += glob.glob(file_path + '2008/*.midi')
train_files += glob.glob(file_path + '2009/*.midi')       # including the rest makes training take 2 min / epoch (very slow!)
train_files += glob.glob(file_path + '2011/*.midi')

# Set test files to years 2013-2018
test_files = glob.glob(file_path + '2013/*.midi')
test_files += glob.glob(file_path + '2014/*.midi')
test_files += glob.glob(file_path + '2015/*.midi')
test_files += glob.glob(file_path + '2017/*.midi')
test_files += glob.glob(file_path + '2018/*.midi')

print(len(train_files))
print(len(test_files))
In [ ]:
# Build Markov chain from training data
from miditok import REMI
from miditoolkit import MidiFile
from collections import defaultdict, Counter
import glob

# Initialize REMI tokenizer
tokenizer = REMI()

# Get all training MIDI file paths from MAESTRO v3.0.0 (years 2004–2011)
train_files = glob.glob('./maestro-v3.0.0/**/**/*.midi', recursive=True)
train_files = [f for f in train_files if any(str(y) in f for y in range(2004, 2012))]

# Initialize transition counter: token -> next_token -> count
transitions = defaultdict(Counter)

# Iterate over MIDI files and count token transitions
for file in train_files:
    encodings = tokenizer.encode(file)  # returns list of Encoding objects
    for encoding in encodings:
        ids = encoding.ids
        for i in range(len(ids) - 1):
            transitions[ids[i]][ids[i + 1]] += 1

# Normalize transition counts into probabilities
markov_model = {
    k: {kk: vv / sum(v.values()) for kk, vv in v.items()}
    for k, v in transitions.items()
}

print(f"Built Markov model with {len(markov_model)} unique starting tokens.")
In [9]:
# Define function to generate a sequence using the Markov chain model
def generate_markov_sequence(start_token, length=200):
    sequence = [start_token]
    for _ in range(length - 1):
        curr = sequence[-1]
        next_tokens = markov_model.get(curr, {})
        if not next_tokens:
            break
        next_token = random.choices(
            list(next_tokens.keys()), weights=list(next_tokens.values()), k=1
        )[0]
        sequence.append(next_token)
    return sequence
In [18]:
from midi2audio import FluidSynth # Import library
from IPython.display import Audio, display
fs = FluidSynth("FluidR3Mono_GM.sf3") # Initialize FluidSynth for MIDI to ausio convertion
In [ ]:
from miditok import TokSequence

# Generate a sequence using the Markov chain
start = random.choice(list(markov_model.keys()))
generated_ids = generate_markov_sequence(start)

# Wrap in a TokSequence and decode to ScoreTick
seq = TokSequence(ids=generated_ids)

try:
    score = tokenizer.decode([seq])  # returns symusic ScoreTick
    score.dump_midi("markov.mid")  # save directly to MIDI
    fs.midi_to_audio("markov.mid", "markov.wav")
    print("Saved markov.mid ✅")
except KeyError as e:
    print("❌ Decode failed due to invalid token ID:", e)
FluidSynth runtime version 2.4.6
Copyright (C) 2000-2025 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.

Rendering audio to file 'symbolic_unconditioned.wav'..
Saved symbolic_unconditioned.mid ✅
In [22]:
from miditok.pytorch_data import DatasetMIDI, DataCollator

tokenizer = REMI()  # using defaults parameters (constants.py)
train_dataset = DatasetMIDI(
    files_paths=train_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
test_dataset = DatasetMIDI(
    files_paths=test_files,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
collator = DataCollator(tokenizer.pad_token_id)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collator)

Define RNN model (LSTM)¶

In [24]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(MusicRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        # x: (batch_size, seq_length)
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(x, hidden)  # out: (batch_size, seq_length, hidden_dim)
        out = self.fc(out)  # (batch_size, seq_length, vocab_size)
        return out, hidden

Define training function¶

In [25]:
def train(model, train_loader, val_loader, vocab_size, num_epochs=10, lr=0.001, device='mps'): # change device depending on system
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # --------- Training ---------
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            batch = batch['input_ids'].to(device)  # (batch_size, seq_length)

            inputs = batch[:, :-1]
            targets = batch[:, 1:]

            optimizer.zero_grad()
            outputs, _ = model(inputs)
            outputs = outputs.reshape(-1, vocab_size)
            targets = targets.reshape(-1)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # --------- Validation ---------
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch['input_ids'].to(device)

                inputs = batch[:, :-1]
                targets = batch[:, 1:]

                outputs, _ = model(inputs)
                outputs = outputs.reshape(-1, vocab_size)
                targets = targets.reshape(-1)

                loss = criterion(outputs, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


# Example usage
if __name__ == "__main__":
    vocab_size = tokenizer.vocab_size
    embedding_dim = 256
    hidden_dim = 512
    num_layers = 2

    model = MusicRNN(vocab_size, embedding_dim, hidden_dim, num_layers)
    train(model, train_loader, test_loader, vocab_size)
Epoch 1/10 | Train Loss: 3.1702 | Val Loss: 2.6880
Epoch 2/10 | Train Loss: 2.5999 | Val Loss: 2.5081
Epoch 3/10 | Train Loss: 2.4480 | Val Loss: 2.4078
Epoch 4/10 | Train Loss: 2.3212 | Val Loss: 2.3390
Epoch 5/10 | Train Loss: 2.2051 | Val Loss: 2.3059
Epoch 6/10 | Train Loss: 2.0832 | Val Loss: 2.2970
Epoch 7/10 | Train Loss: 1.9584 | Val Loss: 2.3091
Epoch 8/10 | Train Loss: 1.8204 | Val Loss: 2.3500
Epoch 9/10 | Train Loss: 1.6794 | Val Loss: 2.4106
Epoch 10/10 | Train Loss: 1.5352 | Val Loss: 2.4966

Define sampling function¶

In [27]:
def sample(model, start_token, max_length=100, temperature=1.0, device='mps'):
    model = model.to(device)
    model.eval()

    generated = [start_token]
    input_token = torch.tensor([[start_token]], device=device)  # (1, 1)

    hidden = None

    for _ in range(max_length):
        output, hidden = model(input_token, hidden)  # output: (1, 1, vocab_size)
        output = output[:, -1, :]  # take the last output
        output = output / temperature  # adjust randomness

        probs = F.softmax(output, dim=-1)  # (1, vocab_size)
        next_token = torch.multinomial(probs, num_samples=1).item()
        generated.append(next_token)
        if next_token == 2 or next_token == 0: # reach end of sequence
          break

        input_token = torch.tensor([[next_token]], device=device)

    return generated

start_token = tokenizer.special_tokens_ids[1]
generated_sequence = sample(model, start_token, max_length=1024)

print("Generated token sequence:")
print(generated_sequence)
Generated token sequence:
[1, 4, 206, 49, 101, 130, 208, 50, 101, 125, 215, 47, 99, 125, 219, 50, 97, 131, 54, 103, 127, 4, 192, 57, 104, 145, 205, 49, 101, 132, 206, 45, 101, 127, 213, 52, 101, 133, 215, 49, 100, 133, 40, 99, 132, 220, 45, 101, 133, 4, 189, 42, 97, 132, 198, 45, 99, 136, 203, 42, 99, 126, 208, 54, 103, 129, 212, 50, 96, 126, 215, 47, 98, 128, 220, 54, 105, 146, 4, 192, 46, 99, 129, 196, 50, 101, 130, 201, 54, 104, 134, 206, 53, 105, 133, 209, 49, 100, 127, 213, 55, 102, 129, 214, 50, 99, 126, 217, 54, 104, 127, 219, 50, 101, 126, 4, 189, 47, 99, 128, 54, 102, 127, 191, 38, 98, 128, 193, 50, 99, 128, 198, 49, 104, 129, 200, 45, 98, 128, 42, 98, 128, 205, 54, 103, 134, 50, 102, 133, 206, 21, 101, 128, 213, 21, 100, 127, 217, 54, 104, 127, 219, 105, 102, 128, 4, 191, 53, 103, 127, 47, 100, 125, 194, 54, 102, 126, 197, 35, 101, 135, 42, 99, 128, 200, 56, 95, 127, 204, 51, 99, 126, 43, 99, 127, 207, 54, 104, 136, 211, 45, 98, 128, 213, 35, 99, 131, 51, 97, 125, 215, 52, 96, 131, 218, 48, 100, 125, 220, 42, 98, 127, 4, 192, 54, 104, 141, 197, 39, 101, 147, 49, 100, 126, 200, 42, 98, 126, 202, 45, 102, 125, 204, 49, 101, 127, 208, 54, 104, 133, 209, 49, 98, 134, 212, 47, 102, 126, 215, 38, 101, 125, 35, 99, 133, 218, 54, 107, 131, 220, 50, 103, 125, 4, 191, 35, 101, 137, 55, 106, 131, 194, 52, 105, 127, 196, 55, 110, 127, 198, 52, 107, 127, 201, 46, 104, 129, 35, 101, 132, 205, 54, 109, 125, 42, 102, 126, 207, 48, 104, 125, 208, 52, 110, 126, 48, 102, 125, 213, 57, 111, 126, 35, 102, 129, 44, 97, 127, 214, 53, 102, 125, 215, 37, 99, 127, 4, 190, 59, 107, 127, 40, 102, 125, 191, 50, 104, 125, 47, 103, 125, 193, 61, 109, 127, 195, 49, 103, 127, 197, 61, 108, 127, 198, 59, 98, 127, 200, 46, 100, 126, 203, 37, 100, 129, 205, 61, 108, 127, 206, 52, 101, 125, 208, 49, 100, 126, 211, 62, 108, 127, 35, 102, 128, 49, 101, 125, 212, 40, 103, 125, 215, 47, 105, 126, 45, 105, 126, 217, 59, 109, 127, 218, 52, 105, 126, 220, 54, 107, 126, 44, 104, 126, 4, 189, 55, 108, 132, 190, 45, 99, 126, 191, 47, 105, 127, 192, 59, 107, 125, 47, 106, 125, 194, 57, 108, 127, 52, 105, 125, 195, 43, 106, 125, 197, 49, 105, 126, 57, 108, 126, 198, 61, 108, 129, 49, 102, 125, 199, 52, 103, 125, 200, 59, 106, 126, 201, 61, 95, 125, 202, 38, 102, 126, 204, 45, 108, 129, 49, 109, 127, 40, 104, 131, 206, 59, 108, 126, 208, 58, 109, 125, 37, 109, 130, 210, 57, 109, 125, 211, 45, 108, 129, 61, 111, 126, 213, 59, 111, 126, 214, 56, 114, 125, 216, 59, 113, 127, 217, 47, 116, 126, 218, 61, 114, 127, 220, 49, 111, 128, 4, 189, 61, 112, 134, 190, 38, 104, 127, 193, 41, 102, 125, 196, 49, 110, 127, 198, 50, 109, 126, 199, 28, 101, 130, 200, 62, 108, 127, 50, 102, 127, 47, 102, 127, 202, 61, 108, 126, 203, 45, 102, 134, 204, 69, 111, 126, 57, 105, 125, 205, 38, 102, 125, 207, 33, 105, 125, 208, 60, 110, 126, 209, 45, 105, 125, 210, 62, 108, 126, 211, 26, 101, 129, 213, 59, 109, 127, 215, 57, 107, 126, 216, 59, 108, 126, 218, 52, 103, 125, 48, 99, 125, 220, 59, 106, 133, 4, 189, 45, 100, 127, 35, 100, 128, 42, 100, 134, 190, 42, 97, 126, 193, 56, 106, 134, 201, 33, 99, 125, 202, 35, 93, 128, 59, 107, 134, 208, 57, 108, 131, 209, 47, 99, 127, 211, 59, 111, 144, 215, 35, 100, 127, 219, 42, 101, 126, 4, 193, 47, 99, 126, 194, 63, 112, 138, 198, 35, 100, 128, 47, 99, 127, 202, 44, 95, 125, 203, 47, 93, 125, 205, 40, 102, 125, 207, 45, 98, 127, 209, 47, 102, 125, 212, 52, 105, 126, 213, 59, 109, 128, 40, 101, 127, 219, 37, 99, 128, 65, 107, 133, 220, 53, 100, 131, 4, 191, 47, 99, 127, 35, 100, 127, 196, 40, 100, 125, 40, 102, 126, 199, 49, 103, 125, 40, 102, 125, 204, 59, 110, 135, 207, 35, 100, 128, 40, 102, 127, 210, 40, 101, 125, 212, 47, 100, 125, 216, 49, 105, 125, 218, 47, 99, 125, 59, 106, 132, 4, 189, 35, 100, 126, 193, 42, 101, 126, 54, 105, 129, 38, 99, 125, 194, 47, 99, 125, 197, 42, 100, 125, 35, 100, 125, 199, 59, 107, 128, 201, 35, 101, 128, 47, 101, 126, 204, 40, 102, 125, 54, 108, 129, 205, 47, 101, 126, 210, 35, 102, 125, 59, 109, 127, 211, 47, 103, 125, 214, 42, 102, 125, 215, 47, 103, 125, 218, 30, 98, 125, 219, 46, 103, 125, 47, 100, 125, 4, 191, 33, 102, 125, 69, 110, 141, 192, 40, 103, 126, 49, 102, 125, 42, 98, 125, 195, 49, 104, 125, 198, 40, 103, 126, 53, 107, 125, 201, 49, 105, 125, 202, 35, 100, 125, 45, 99, 125, 203, 61, 109, 131, 49, 100, 125, 206, 47, 103, 125, 207, 42, 102, 125, 45, 98, 125, 210, 50, 108, 127, 47, 103, 125, 212, 47, 106, 125, 213, 50, 107, 125, 214, 47, 103, 126, 215, 47, 105, 125, 216, 50, 104, 127, 47, 107, 125, 217, 50, 105, 125, 219, 54, 111, 130, 47, 105, 125, 220, 50, 108, 130, 4, 191, 47, 105, 125, 192, 50, 107, 125, 193, 54, 108, 127, 47, 107, 125, 195, 42, 105, 126, 197, 47, 107, 125, 54, 107, 125, 198, 51, 108, 126, 199, 46, 106, 126, 200, 50, 108, 126, 201, 47, 108, 125, 202, 32, 108, 128]
In [ ]:
output_score = tokenizer.tokens_to_midi([generated_sequence])
output_score.dump_midi(f"symbolic_unconditioned.mid")
fs.midi_to_audio("symbolic_unconditioned.mid", "symbolic_unconditioned.wav")
display(Audio("symbolic_unconditioned.wav"))
/var/folders/2y/r1p8gt5d4vxd06pv8k49c87m0000gn/T/ipykernel_30668/72160659.py:1: UserWarning: miditok: The `tokens_to_midi` method had been renamed `decode`. It is now depreciated and will be removed in future updates.
  output_score = tokenizer.tokens_to_midi([generated_sequence])
FluidSynth runtime version 2.4.6
Copyright (C) 2000-2025 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.

Rendering audio to file 'rnn.wav'..
Your browser does not support the audio element.

Symbolic, conditioned generation (harmonization)¶

In [30]:
# Define simple harmonization model with LSTM
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset cleass to prepare data for molody -> chord harmonization
class HarmonizationDataset(Dataset):
    def __init__(self, file_paths, tokenizer):
        self.data = []
        # Iterate through MIDI files, limit to 100 for speed
        for path in file_paths[:100]:  # limit to 100 for speed
            try:
                encodings = tokenizer.encode(path)
                ids = []
                for encoding in encodings:
                    ids.extend(encoding.ids)  # flatten all token segments into a single sequence
                # Slide over tokensized MIDI to generate melody-chord pairs
                for i in range(len(ids) - 16):
                    melody = ids[i:i+8] # First 8 tokens are melody
                    chords = ids[i+8:i+16] # Next 8 tokens are chords
                    self.data.append((melody, chords))
            except Exception as e:
                print(f"Skipping file {path} due to error: {e}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        melody, chords = self.data[idx]
        return torch.tensor(melody), torch.tensor(chords)

# Define the LSTM based harmonization model
class HarmonizationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        # Embedding layer transforms token IDs to dense vectors
        self.embed = nn.Embedding(vocab_size, embed_dim)
        # LSTM captures sequential patterns in melody and chord progression
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        # Linear layer maps LSTM output to chord token probabilities
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        output, _ = self.lstm(x)
        logits = self.fc(output)
        return logits
In [31]:
# Create dataset and loader using train_files
dataset = HarmonizationDataset(train_files, tokenizer)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize model, loss, and optimizer
model = HarmonizationModel(tokenizer.vocab_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# Simple training loop
for epoch in range(7):
    model.train()
    train_losses = []

    for melody, chords in loader:
        preds = model(melody)
        loss = loss_fn(preds.view(-1, preds.shape[-1]), chords.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    avg_train_loss = sum(train_losses) / len(train_losses)
    print(f"Epoch {epoch+1}/10 | Train Loss: {avg_train_loss:.4f}")
Epoch 1/10 | Train Loss: 3.7491
Epoch 2/10 | Train Loss: 3.6946
Epoch 3/10 | Train Loss: 3.6709
Epoch 4/10 | Train Loss: 3.6546
Epoch 5/10 | Train Loss: 3.6428
Epoch 6/10 | Train Loss: 3.6340
Epoch 7/10 | Train Loss: 3.6272
In [32]:
# Select melody sample
melody, _ = dataset[0]

# Use trained model to predict chords
model.eval()
with torch.no_grad():
    preds = model(melody.unsqueeze(0))  
    pred_ids = preds.argmax(dim=-1).squeeze().tolist()  # Get predicted chord token IDs
In [33]:
from miditok import TokSequence

# Combine melody and predicted chords into one list of IDs
combined = melody.tolist() + pred_ids

# Wrap in a TokSequence
seq = TokSequence(ids=combined)

# Decode into a ScoreTick
score = tokenizer.decode([seq])  # returns ScoreTick

# Dump the ScoreTick to a .mid file
score.dump_midi("symbolic_conditioned.mid")
fs.midi_to_audio("symbolic_conditioned.mid", "symbolic_conditioned.wav")
print("✅ Saved symbolic_conditioned.mid")
FluidSynth runtime version 2.4.6
Copyright (C) 2000-2025 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of Creative Technology Ltd.

Rendering audio to file 'symbolic_conditioned.wav'..
✅ Saved symbolic_conditioned.mid
In [37]:
display(Audio("symbolic_conditioned.wav"))
Your browser does not support the audio element.
In [35]:
display(Audio("symbolic_unconditioned.wav"))
Your browser does not support the audio element.
In [36]:
display(Audio("rnn.wav"))
Your browser does not support the audio element.